In [1]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
Read in the data...
In [2]:
raw_data = pd.read_csv("/Users/Frankie/Documents/Dissertation/Data/Twitter/twitter_all.csv",\
names = ["symptom", "tweet"], encoding = 'iso-8859-1').fillna('Control')
raw_data[:5]
Out[2]:
Filter for Pain and Control tweets.
In [3]:
# remove duplicate tweets and keep just the tweets relating to pain
raw_data = raw_data.drop_duplicates()
pain = raw_data[(raw_data.symptom == 'Pain')]
# get a sample of control tweets
control = raw_data[(raw_data.symptom == 'Control')].sample(pain.shape[0])
raw_data = pd.concat((pain,control))
raw_data[:5]
Out[3]:
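Note that sample draws a different set of control tweets on every run; passing a random_state would make the balanced sample reproducible (a minimal tweak, not used above):
control = raw_data[(raw_data.symptom == 'Control')].sample(pain.shape[0], random_state=0)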
In [4]:
# round-trip through CSV to reset the row indices after the filtering above
raw_data.to_csv("/Users/Frankie/Documents/Dissertation/Data/Twitter/preprocessed/temp_df_nodup2.csv",index=False,encoding = 'iso-8859-1')
raw_data = pd.read_csv("/Users/Frankie/Documents/Dissertation/Data/Twitter/preprocessed/temp_df_nodup2.csv", encoding = 'iso-8859-1')
raw_data[:5]
Out[4]:
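The CSV round-trip is one way to reset the indices; the same thing can be done in memory with reset_index (a sketch, avoiding the temporary file):
raw_data = raw_data.reset_index(drop=True)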
Convert the words into features.
In [5]:
# use the scikit-learn CountVectorizer to turn the tweets into features
# parameters are set to:
# > binary - record word presence/absence as 1/0 rather than counts
# > min_df - exclude any word appearing in fewer than 10 documents (passed as a proportion of the corpus)
# > analyzer - count words, not characters
# > ngram_range - extract unigrams and bigrams
features = raw_data["tweet"].values
vec = CountVectorizer( binary = True, min_df = 10/features.shape[0], analyzer = 'word', ngram_range=(1, 2))
# transform the data
data_features = vec.fit_transform(features)
vocab = vec.get_feature_names()
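CountVectorizer also accepts min_df as an absolute document count, so the same cut-off could be written without the division:
vec = CountVectorizer(binary = True, min_df = 10, analyzer = 'word', ngram_range=(1, 2))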
In [6]:
# densify the sparse matrix into a DataFrame, with the vocabulary as column names
data_features = pd.DataFrame(data_features.toarray(), columns = vocab)
data_features[:5]
Out[6]:
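Note that toarray() materialises the full dense matrix, which is fine at this corpus size; for much larger corpora the sparse matrix returned by fit_transform can be passed to most scikit-learn estimators directly.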
Convert the symptom into the label column.
In [7]:
# label Pain tweets 1 and Control tweets 0
raw_data['label'] = np.where(raw_data['symptom']=='Control', 0, 1)
label_df = raw_data.drop(['tweet'], axis=1)
label_df[:5]
Out[7]:
Merge label and features.
In [8]:
# join the label and feature columns on the shared row index
data_final = pd.merge(label_df, data_features, left_index=True, right_index=True)
data_final[:5]
Out[8]:
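Since both frames share the same zero-based index after the reset, pd.concat along the columns gives the same result as the index merge (an equivalent alternative):
data_final = pd.concat([label_df, data_features], axis=1)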
Save the file.
In [9]:
# drop the symptom column and write out the label plus the word features
data_final.drop(['symptom'], axis=1).to_csv("/Users/Frankie/Desktop/pain.csv",index=False, header = False)
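Because header = False drops the vocabulary column names, they can be written out separately if needed later (a sketch; the output path is illustrative):
# hypothetical companion file holding the vocabulary, one term per line
pd.Series(vocab).to_csv("/Users/Frankie/Desktop/pain_vocab.csv", index=False, header=False)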